5 Host genomics
5.1 Host DNA fraction
5.1.1 Data overview
# Mean and standard deviation of the host DNA percentage per sample type.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  # host_bases expressed as a percentage of bases_post_fastp
  mutate(host_percentage = host_bases / bases_post_fastp * 100) %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  group_by(sample_type) %>%
  # TRUE is spelled out: the shorthand T is an ordinary, reassignable variable.
  summarise(
    mean = mean(host_percentage, na.rm = TRUE),
    sd = sd(host_percentage, na.rm = TRUE)
  ) %>%
  tt()

| sample_type | mean | sd |
|---|---|---|
| Anal/cloacal swab | 75.76413 | 32.05899 |
| Faecal | 22.94532 | 32.09473 |
# Mean and standard deviation of the host DNA percentage per taxonomic group.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  # host_bases expressed as a percentage of bases_post_fastp
  mutate(host_percentage = host_bases / bases_post_fastp * 100) %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  group_by(tax_group) %>%
  # TRUE is spelled out: the shorthand T is an ordinary, reassignable variable.
  summarise(
    mean = mean(host_percentage, na.rm = TRUE),
    sd = sd(host_percentage, na.rm = TRUE)
  ) %>%
  tt()

| tax_group | mean | sd |
|---|---|---|
| Amphibians | 0.2968512 | 1.367301 |
| Bats | 49.4004705 | 37.779131 |
| Birds | 58.5039633 | 38.603463 |
| Mammals | 29.5029544 | 36.464343 |
| Reptiles | 12.4971018 | 22.008872 |
5.1.2 Statistical test
# Linear model of host percentage on sample type and taxonomic group,
# reported as a tidy ANOVA table.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  # host_bases expressed as a percentage of bases_post_fastp
  mutate(host_percentage = host_bases / bases_post_fastp * 100) %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  # anova() on a single lm fit tests the terms sequentially, in formula order.
  lm(
    formula = host_percentage ~ sample_type + tax_group,
    data = .
  ) %>%
  anova() %>%
  tidy()
# A tibble: 3 × 6
term df sumsq meansq statistic p.value
<chr> <int> <dbl> <dbl> <dbl> <dbl>
1 sample_type 1 798510. 798510. 991. 2.27e-177
2 tax_group 4 456840. 114210. 142. 6.34e-107
3 Residuals 2019 1626206. 805. NA NA
5.1.3 Plot
# Plot 1: host percentage per taxonomic group, one panel per sample type.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  # host_bases expressed as a percentage of bases_post_fastp
  mutate(host_percentage = host_bases / bases_post_fastp * 100) %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  # Fix the x-axis order; the manual colour vectors below follow this order.
  mutate(
    tax_group = factor(
      tax_group,
      levels = c("Amphibians", "Reptiles", "Birds", "Bats", "Mammals")
    )
  ) %>%
  ggplot(
    aes(
      y = host_percentage,
      x = tax_group,
      color = tax_group,
      fill = tax_group,
      group = tax_group
    )
  ) +
  geom_jitter(alpha = 0.2, width = 0.3) +
  # Boxplot outliers are hidden; the jitter layer already draws every point.
  geom_boxplot(outlier.shape = NA) +
  scale_color_manual(
    values = c("#228833", "#EE6677", "#CCBB44", "#66CCEE", "#4477AA")
  ) +
  # Same palette as the outlines, with an 80 alpha suffix for the fill.
  scale_fill_manual(
    values = c("#22883380", "#EE667780", "#CCBB4480", "#66CCEE80", "#4477AA80")
  ) +
  facet_grid(~sample_type) +
  labs(y = "Host percentage", color = "Taxa", fill = "Taxa") +
  # theme_classic() was applied twice; once is enough.
  theme_classic()

# Plot 2: distribution of host percentage per sample type (half-eye).
# The tax_group factor step and the colour/fill legend titles were dropped:
# this plot maps neither tax_group nor any colour/fill aesthetic.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  # host_bases expressed as a percentage of bases_post_fastp
  mutate(host_percentage = host_bases / bases_post_fastp * 100) %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  ggplot(aes(y = host_percentage, x = sample_type, group = sample_type)) +
  stat_halfeye(
    adjust = 1,
    width = 0.5,
    .width = 0,
    justification = 0,
    normalize = "groups"
  ) +
  labs(y = "Host percentage") +
  theme_classic()

# With no `plot` argument, ggsave() writes the most recent plot (plot 2).
# NOTE(review): the file name suggests the per-taxa figure (plot 1) may have
# been intended here - confirm which plot should be saved.
ggsave("figures/hostdata_taxa_all.pdf", width = 9, height = 4, units = "in")

5.2 Genome depth
5.2.1 Data overview
# Mean and standard deviation of host genome depth per sample type.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  left_join(read_tsv("data/reference.tsv"), by = "reference_id") %>%
  # Depth = host bases divided by the reference genome size.
  # reference_size is scaled by 1e6 - presumably stored in megabases; TODO confirm.
  mutate(depth = host_bases / (reference_size * 1000000)) %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  group_by(sample_type) %>%
  # TRUE is spelled out: the shorthand T is an ordinary, reassignable variable.
  summarise(
    mean = mean(depth, na.rm = TRUE),
    sd = sd(depth, na.rm = TRUE)
  ) %>%
  tt()

| sample_type | mean | sd |
|---|---|---|
| Anal/cloacal swab | 2.4220586 | 2.079382 |
| Faecal | 0.6573435 | 1.628490 |
# Mean and standard deviation of host genome depth per taxonomic group.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  left_join(read_tsv("data/reference.tsv"), by = "reference_id") %>%
  # Depth = host bases divided by the reference genome size.
  # reference_size is scaled by 1e6 - presumably stored in megabases; TODO confirm.
  mutate(depth = host_bases / (reference_size * 1000000)) %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  group_by(tax_group) %>%
  # TRUE is spelled out: the shorthand T is an ordinary, reassignable variable.
  summarise(
    mean = mean(depth, na.rm = TRUE),
    sd = sd(depth, na.rm = TRUE)
  ) %>%
  tt()

| tax_group | mean | sd |
|---|---|---|
| Amphibians | 0.001921062 | 0.01314311 |
| Bats | 1.246383229 | 1.39021316 |
| Birds | 2.478493853 | 3.08305053 |
| Mammals | 0.595230910 | 1.27474674 |
| Reptiles | 0.345241814 | 0.60682463 |
5.2.2 Statistical test
# Linear model of host genome depth on sample type and taxonomic group,
# reported as a tidy ANOVA table.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  left_join(read_tsv("data/reference.tsv"), by = "reference_id") %>%
  # Depth = host bases divided by the reference genome size.
  # reference_size is scaled by 1e6 - presumably stored in megabases; TODO confirm.
  mutate(depth = host_bases / (reference_size * 1000000)) %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  # anova() on a single lm fit tests the terms sequentially, in formula order.
  lm(
    formula = depth ~ sample_type + tax_group,
    data = .
  ) %>%
  anova() %>%
  tidy()
# A tibble: 3 × 6
term df sumsq meansq statistic p.value
<chr> <int> <dbl> <dbl> <dbl> <dbl>
1 sample_type 1 893. 893. 349. 6.07e-72
2 tax_group 4 772. 193. 75.3 1.28e-59
3 Residuals 2035 5211. 2.56 NA NA
5.2.3 Plot
# Boxplot of host genome depth of coverage per sample type.
# The tax_group factor step was dropped: this plot never uses tax_group.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  left_join(read_tsv("data/reference.tsv"), by = "reference_id") %>%
  # Depth = host bases divided by the reference genome size.
  # reference_size is scaled by 1e6 - presumably stored in megabases; TODO confirm.
  mutate(depth = host_bases / (reference_size * 1000000)) %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  ggplot(
    aes(
      y = depth,
      x = sample_type,
      color = sample_type,
      fill = sample_type,
      group = sample_type
    )
  ) +
  geom_boxplot(outlier.shape = NA) +
  scale_color_manual(values = c("#bdca50", "#AA3377")) +
  scale_fill_manual(values = c("#bdca5080", "#AA337780")) +
  # Zoom the y axis instead of using ylim(): ylim() discards observations
  # above 10 before the box statistics are computed, which distorts the
  # medians and quartiles. coord_cartesian() only crops the view.
  coord_cartesian(ylim = c(0, 10)) +
  # Colour and fill encode sample_type, so the legend is titled accordingly.
  labs(
    y = "Host depth of coverage",
    color = "Sample type",
    fill = "Sample type"
  ) +
  # theme_classic() was applied twice; once is enough.
  theme_classic()